1 /*
2 * Copyright (C) 2009 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.base;
18
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.base.Preconditions.checkNotNull;
21
22 import com.google.common.annotations.Beta;
23 import com.google.common.annotations.GwtCompatible;
24 import com.google.common.annotations.GwtIncompatible;
25
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.Iterator;
29 import java.util.LinkedHashMap;
30 import java.util.List;
31 import java.util.Map;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34
35 import javax.annotation.CheckReturnValue;
36
37 /**
38 * Extracts non-overlapping substrings from an input string, typically by
39 * recognizing appearances of a <i>separator</i> sequence. This separator can be
40 * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
41 * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
42 * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
43 * all, a splitter can extract adjacent substrings of a given {@linkplain
44 * #fixedLength fixed length}.
45 *
46 * <p>For example, this expression: <pre> {@code
47 *
48 * Splitter.on(',').split("foo,bar,qux")}</pre>
49 *
50 * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
51 * {@code "qux"}, in that order.
52 *
53 * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
54 * following expression: <pre> {@code
55 *
56 * Splitter.on(',').split(" foo,,, bar ,")}</pre>
57 *
58 * ... yields the substrings {@code [" foo", "", "", " bar ", ""]}. If this
59 * is not the desired behavior, use configuration methods to obtain a <i>new</i>
60 * splitter instance with modified behavior: <pre> {@code
61 *
62 * private static final Splitter MY_SPLITTER = Splitter.on(',')
63 * .trimResults()
64 * .omitEmptyStrings();}</pre>
65 *
66 * <p>Now {@code MY_SPLITTER.split("foo,,, bar ,")} returns just {@code ["foo",
67 * "bar"]}. Note that the order in which these configuration methods are called
68 * is never significant.
69 *
70 * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
71 * method has no effect on the receiving instance; you must store and use the
72 * new splitter instance it returns instead. <pre> {@code
73 *
74 * // Do NOT do this
75 * Splitter splitter = Splitter.on('/');
76 * splitter.trimResults(); // does nothing!
77 * return splitter.split("wrong / wrong / wrong");}</pre>
78 *
79 * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
80 * input string containing {@code n} occurrences of the separator naturally
81 * yields an iterable of size {@code n + 1}. So if the separator does not occur
82 * anywhere in the input, a single substring is returned containing the entire
83 * input. Consequently, all splitters split the empty string to {@code [""]}
84 * (note: even fixed-length splitters).
85 *
 * <p>Splitter instances are thread-safe and immutable, and are therefore safe
 * to store as {@code static final} constants.
88 *
89 * <p>The {@link Joiner} class provides the inverse operation to splitting, but
90 * note that a round-trip between the two should be assumed to be lossy.
91 *
92 * <p>See the Guava User Guide article on <a href=
93 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
94 * {@code Splitter}</a>.
95 *
96 * @author Julien Silland
97 * @author Jesse Wilson
98 * @author Kevin Bourrillion
99 * @author Louis Wasserman
100 * @since 1.0
101 */
102 @GwtCompatible(emulated = true)
103 public final class Splitter {
/** Locates separators in the input and builds the iterator that does the splitting. */
private final Strategy strategy;
/** When true, substrings that are empty once trimmed are skipped. */
private final boolean omitEmptyStrings;
/** Matches the characters stripped from both ends of each substring. */
private final CharMatcher trimmer;
/** Upper bound on the number of substrings a single split may produce. */
private final int limit;

/**
 * Creates a splitter with the default configuration: empty strings are
 * kept, the trimmer is {@code CharMatcher.NONE} and the limit is
 * {@code Integer.MAX_VALUE}.
 */
private Splitter(Strategy strategy) {
  this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
}

/** Creates a splitter from an explicit value for every setting. */
private Splitter(Strategy strategy, boolean omitEmptyStrings,
    CharMatcher trimmer, int limit) {
  this.limit = limit;
  this.trimmer = trimmer;
  this.omitEmptyStrings = omitEmptyStrings;
  this.strategy = strategy;
}
120
121 /**
122 * Returns a splitter that uses the given single-character separator. For
123 * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
124 * containing {@code ["foo", "", "bar"]}.
125 *
126 * @param separator the character to recognize as a separator
127 * @return a splitter, with default settings, that recognizes that separator
128 */
129 public static Splitter on(char separator) {
130 return on(CharMatcher.is(separator));
131 }
132
133 /**
134 * Returns a splitter that considers any single character matched by the
135 * given {@code CharMatcher} to be a separator. For example, {@code
136 * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
137 * iterable containing {@code ["foo", "", "bar", "quux"]}.
138 *
139 * @param separatorMatcher a {@link CharMatcher} that determines whether a
140 * character is a separator
141 * @return a splitter, with default settings, that uses this matcher
142 */
143 public static Splitter on(final CharMatcher separatorMatcher) {
144 checkNotNull(separatorMatcher);
145
146 return new Splitter(new Strategy() {
147 @Override public SplittingIterator iterator(
148 Splitter splitter, final CharSequence toSplit) {
149 return new SplittingIterator(splitter, toSplit) {
150 @Override int separatorStart(int start) {
151 return separatorMatcher.indexIn(toSplit, start);
152 }
153
154 @Override int separatorEnd(int separatorPosition) {
155 return separatorPosition + 1;
156 }
157 };
158 }
159 });
160 }
161
162 /**
163 * Returns a splitter that uses the given fixed string as a separator. For
164 * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
165 * iterable containing {@code ["foo", "bar,baz"]}.
166 *
167 * @param separator the literal, nonempty string to recognize as a separator
168 * @return a splitter, with default settings, that recognizes that separator
169 */
170 public static Splitter on(final String separator) {
171 checkArgument(separator.length() != 0,
172 "The separator may not be the empty string.");
173
174 return new Splitter(new Strategy() {
175 @Override public SplittingIterator iterator(
176 Splitter splitter, CharSequence toSplit) {
177 return new SplittingIterator(splitter, toSplit) {
178 @Override public int separatorStart(int start) {
179 int separatorLength = separator.length();
180
181 positions:
182 for (int p = start, last = toSplit.length() - separatorLength;
183 p <= last; p++) {
184 for (int i = 0; i < separatorLength; i++) {
185 if (toSplit.charAt(i + p) != separator.charAt(i)) {
186 continue positions;
187 }
188 }
189 return p;
190 }
191 return -1;
192 }
193
194 @Override public int separatorEnd(int separatorPosition) {
195 return separatorPosition + separator.length();
196 }
197 };
198 }
199 });
200 }
201
202 /**
203 * Returns a splitter that considers any subsequence matching {@code
204 * pattern} to be a separator. For example, {@code
205 * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
206 * into lines whether it uses DOS-style or UNIX-style line terminators.
207 *
208 * @param separatorPattern the pattern that determines whether a subsequence
209 * is a separator. This pattern may not match the empty string.
210 * @return a splitter, with default settings, that uses this pattern
211 * @throws IllegalArgumentException if {@code separatorPattern} matches the
212 * empty string
213 */
214 @GwtIncompatible("java.util.regex")
215 public static Splitter on(final Pattern separatorPattern) {
216 checkNotNull(separatorPattern);
217 checkArgument(!separatorPattern.matcher("").matches(),
218 "The pattern may not match the empty string: %s", separatorPattern);
219
220 return new Splitter(new Strategy() {
221 @Override public SplittingIterator iterator(
222 final Splitter splitter, CharSequence toSplit) {
223 final Matcher matcher = separatorPattern.matcher(toSplit);
224 return new SplittingIterator(splitter, toSplit) {
225 @Override public int separatorStart(int start) {
226 return matcher.find(start) ? matcher.start() : -1;
227 }
228
229 @Override public int separatorEnd(int separatorPosition) {
230 return matcher.end();
231 }
232 };
233 }
234 });
235 }
236
237 /**
238 * Returns a splitter that considers any subsequence matching a given
239 * pattern (regular expression) to be a separator. For example, {@code
240 * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
241 * whether it uses DOS-style or UNIX-style line terminators. This is
242 * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
243 *
244 * @param separatorPattern the pattern that determines whether a subsequence
245 * is a separator. This pattern may not match the empty string.
246 * @return a splitter, with default settings, that uses this pattern
247 * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
248 * is a malformed expression
249 * @throws IllegalArgumentException if {@code separatorPattern} matches the
250 * empty string
251 */
252 @GwtIncompatible("java.util.regex")
253 public static Splitter onPattern(String separatorPattern) {
254 return on(Pattern.compile(separatorPattern));
255 }
256
257 /**
258 * Returns a splitter that divides strings into pieces of the given length.
259 * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
260 * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
261 * smaller than {@code length} but will never be empty.
262 *
263 * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
264 * split("")} does not yield an empty iterable, but an iterable containing
265 * {@code ""}. This is the only case in which {@code
266 * Iterables.size(split(input))} does not equal {@code
267 * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
268 * use {@code omitEmptyStrings}.
269 *
270 * @param length the desired length of pieces after splitting, a positive
271 * integer
272 * @return a splitter, with default settings, that can split into fixed sized
273 * pieces
274 * @throws IllegalArgumentException if {@code length} is zero or negative
275 */
276 public static Splitter fixedLength(final int length) {
277 checkArgument(length > 0, "The length may not be less than 1");
278
279 return new Splitter(new Strategy() {
280 @Override public SplittingIterator iterator(
281 final Splitter splitter, CharSequence toSplit) {
282 return new SplittingIterator(splitter, toSplit) {
283 @Override public int separatorStart(int start) {
284 int nextChunkStart = start + length;
285 return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
286 }
287
288 @Override public int separatorEnd(int separatorPosition) {
289 return separatorPosition;
290 }
291 };
292 }
293 });
294 }
295
296 /**
297 * Returns a splitter that behaves equivalently to {@code this} splitter, but
298 * automatically omits empty strings from the results. For example, {@code
299 * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
300 * iterable containing only {@code ["a", "b", "c"]}.
301 *
302 * <p>If either {@code trimResults} option is also specified when creating a
303 * splitter, that splitter always trims results first before checking for
304 * emptiness. So, for example, {@code
305 * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
306 * an empty iterable.
307 *
308 * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
309 * to return an empty iterable, but when using this option, it can (if the
310 * input sequence consists of nothing but separators).
311 *
312 * @return a splitter with the desired configuration
313 */
314 @CheckReturnValue
315 public Splitter omitEmptyStrings() {
316 return new Splitter(strategy, true, trimmer, limit);
317 }
318
319 /**
320 * Returns a splitter that behaves equivalently to {@code this} splitter but
321 * stops splitting after it reaches the limit.
322 * The limit defines the maximum number of items returned by the iterator.
323 *
324 * <p>For example,
325 * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
326 * containing {@code ["a", "b", "c,d"]}. When omitting empty strings, the
327 * omitted strings do no count. Hence,
328 * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
329 * returns an iterable containing {@code ["a", "b", "c,d"}.
330 * When trim is requested, all entries, including the last are trimmed. Hence
331 * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
332 * results in @{code ["a", "b", "c , d"]}.
333 *
334 * @param limit the maximum number of items returns
335 * @return a splitter with the desired configuration
336 * @since 9.0
337 */
338 @CheckReturnValue
339 public Splitter limit(int limit) {
340 checkArgument(limit > 0, "must be greater than zero: %s", limit);
341 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
342 }
343
344 /**
345 * Returns a splitter that behaves equivalently to {@code this} splitter, but
346 * automatically removes leading and trailing {@linkplain
347 * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
348 * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
349 * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
350 * containing {@code ["a", "b", "c"]}.
351 *
352 * @return a splitter with the desired configuration
353 */
354 @CheckReturnValue
355 public Splitter trimResults() {
356 return trimResults(CharMatcher.WHITESPACE);
357 }
358
359 /**
360 * Returns a splitter that behaves equivalently to {@code this} splitter, but
361 * removes all leading or trailing characters matching the given {@code
362 * CharMatcher} from each returned substring. For example, {@code
363 * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
364 * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
365 *
366 * @param trimmer a {@link CharMatcher} that determines whether a character
367 * should be removed from the beginning/end of a subsequence
368 * @return a splitter with the desired configuration
369 */
370 // TODO(kevinb): throw if a trimmer was already specified!
371 @CheckReturnValue
372 public Splitter trimResults(CharMatcher trimmer) {
373 checkNotNull(trimmer);
374 return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
375 }
376
377 /**
378 * Splits {@code sequence} into string components and makes them available
379 * through an {@link Iterator}, which may be lazily evaluated. If you want
380 * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
381 *
382 * @param sequence the sequence of characters to split
383 * @return an iteration over the segments split from the parameter.
384 */
385 public Iterable<String> split(final CharSequence sequence) {
386 checkNotNull(sequence);
387
388 return new Iterable<String>() {
389 @Override public Iterator<String> iterator() {
390 return splittingIterator(sequence);
391 }
392 @Override public String toString() {
393 return Joiner.on(", ")
394 .appendTo(new StringBuilder().append('['), this)
395 .append(']')
396 .toString();
397 }
398 };
399 }
400
401 private Iterator<String> splittingIterator(CharSequence sequence) {
402 return strategy.iterator(this, sequence);
403 }
404
405 /**
406 * Splits {@code sequence} into string components and returns them as
407 * an immutable list. If you want an {@link Iterable} which may be lazily
408 * evaluated, use {@link #split(CharSequence)}.
409 *
410 * @param sequence the sequence of characters to split
411 * @return an immutable list of the segments split from the parameter
412 * @since 15.0
413 */
414 @Beta
415 public List<String> splitToList(CharSequence sequence) {
416 checkNotNull(sequence);
417
418 Iterator<String> iterator = splittingIterator(sequence);
419 List<String> result = new ArrayList<String>();
420
421 while (iterator.hasNext()) {
422 result.add(iterator.next());
423 }
424
425 return Collections.unmodifiableList(result);
426 }
427
428 /**
429 * Returns a {@code MapSplitter} which splits entries based on this splitter,
430 * and splits entries into keys and values using the specified separator.
431 *
432 * @since 10.0
433 */
434 @CheckReturnValue
435 @Beta
436 public MapSplitter withKeyValueSeparator(String separator) {
437 return withKeyValueSeparator(on(separator));
438 }
439
440 /**
441 * Returns a {@code MapSplitter} which splits entries based on this splitter,
442 * and splits entries into keys and values using the specified separator.
443 *
444 * @since 14.0
445 */
446 @CheckReturnValue
447 @Beta
448 public MapSplitter withKeyValueSeparator(char separator) {
449 return withKeyValueSeparator(on(separator));
450 }
451
452 /**
453 * Returns a {@code MapSplitter} which splits entries based on this splitter,
454 * and splits entries into keys and values using the specified key-value
455 * splitter.
456 *
457 * @since 10.0
458 */
459 @CheckReturnValue
460 @Beta
461 public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
462 return new MapSplitter(this, keyValueSplitter);
463 }
464
465 /**
466 * An object that splits strings into maps as {@code Splitter} splits
467 * iterables and lists. Like {@code Splitter}, it is thread-safe and
468 * immutable.
469 *
470 * @since 10.0
471 */
472 @Beta
473 public static final class MapSplitter {
474 private static final String INVALID_ENTRY_MESSAGE =
475 "Chunk [%s] is not a valid entry";
476 private final Splitter outerSplitter;
477 private final Splitter entrySplitter;
478
479 private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
480 this.outerSplitter = outerSplitter; // only "this" is passed
481 this.entrySplitter = checkNotNull(entrySplitter);
482 }
483
484 /**
485 * Splits {@code sequence} into substrings, splits each substring into
486 * an entry, and returns an unmodifiable map with each of the entries. For
487 * example, <code>
488 * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
489 * .split("a=>b ; c=>b")
490 * </code> will return a mapping from {@code "a"} to {@code "b"} and
491 * {@code "c"} to {@code b}.
492 *
493 * <p>The returned map preserves the order of the entries from
494 * {@code sequence}.
495 *
496 * @throws IllegalArgumentException if the specified sequence does not split
497 * into valid map entries, or if there are duplicate keys
498 */
499 public Map<String, String> split(CharSequence sequence) {
500 Map<String, String> map = new LinkedHashMap<String, String>();
501 for (String entry : outerSplitter.split(sequence)) {
502 Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
503
504 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
505 String key = entryFields.next();
506 checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
507
508 checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
509 String value = entryFields.next();
510 map.put(key, value);
511
512 checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
513 }
514 return Collections.unmodifiableMap(map);
515 }
516 }
517
/**
 * Factory for the iterator that performs the actual splitting. Each static
 * factory method of {@code Splitter} supplies its own implementation, which
 * encodes how separators are located.
 */
private interface Strategy {
  /**
   * Creates an iterator over the pieces of {@code toSplit}.
   *
   * @param splitter the splitter whose settings (trimmer, empty-string
   *     handling, limit) the iterator should apply
   * @param toSplit the character sequence to split
   */
  Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
}
521
522 private abstract static class SplittingIterator extends AbstractIterator<String> {
523 final CharSequence toSplit;
524 final CharMatcher trimmer;
525 final boolean omitEmptyStrings;
526
527 /**
528 * Returns the first index in {@code toSplit} at or after {@code start}
529 * that contains the separator.
530 */
531 abstract int separatorStart(int start);
532
533 /**
534 * Returns the first index in {@code toSplit} after {@code
535 * separatorPosition} that does not contain a separator. This method is only
536 * invoked after a call to {@code separatorStart}.
537 */
538 abstract int separatorEnd(int separatorPosition);
539
540 int offset = 0;
541 int limit;
542
543 protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
544 this.trimmer = splitter.trimmer;
545 this.omitEmptyStrings = splitter.omitEmptyStrings;
546 this.limit = splitter.limit;
547 this.toSplit = toSplit;
548 }
549
550 @Override protected String computeNext() {
551 /*
552 * The returned string will be from the end of the last match to the
553 * beginning of the next one. nextStart is the start position of the
554 * returned substring, while offset is the place to start looking for a
555 * separator.
556 */
557 int nextStart = offset;
558 while (offset != -1) {
559 int start = nextStart;
560 int end;
561
562 int separatorPosition = separatorStart(offset);
563 if (separatorPosition == -1) {
564 end = toSplit.length();
565 offset = -1;
566 } else {
567 end = separatorPosition;
568 offset = separatorEnd(separatorPosition);
569 }
570 if (offset == nextStart) {
571 /*
572 * This occurs when some pattern has an empty match, even if it
573 * doesn't match the empty string -- for example, if it requires
574 * lookahead or the like. The offset must be increased to look for
575 * separators beyond this point, without changing the start position
576 * of the next returned substring -- so nextStart stays the same.
577 */
578 offset++;
579 if (offset >= toSplit.length()) {
580 offset = -1;
581 }
582 continue;
583 }
584
585 while (start < end && trimmer.matches(toSplit.charAt(start))) {
586 start++;
587 }
588 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
589 end--;
590 }
591
592 if (omitEmptyStrings && start == end) {
593 // Don't include the (unused) separator in next split string.
594 nextStart = offset;
595 continue;
596 }
597
598 if (limit == 1) {
599 // The limit has been reached, return the rest of the string as the
600 // final item. This is tested after empty string removal so that
601 // empty strings do not count towards the limit.
602 end = toSplit.length();
603 offset = -1;
604 // Since we may have changed the end, we need to trim it again.
605 while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
606 end--;
607 }
608 } else {
609 limit--;
610 }
611
612 return toSplit.subSequence(start, end).toString();
613 }
614 return endOfData();
615 }
616 }
617 }